/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	db_target_all.csv (downloaded February 2020)
			db_target_pharmacologically_active.csv (downloaded February 2020)
			db_drug_index.csv (downloaded February 2020)

Outputs: 	clean_drugbank.dta

Note: This file cannot run without restricted data (see README)

*******************************************************************************/

clear all

* Build one row per DrugBank drug ID with 0/1 status flags
* (approved, experimental, investigational) parsed from the druggroups string.
import delimited using "${data_raw}Restricted/db_drug_index.csv", clear 
keep drugbankid druggroups 

* Remove "vet_approved" before matching so that the substring "approved"
* inside it does not set the approved flag.
gen approved = regexm(subinstr(druggroups,"vet_approved","",.),"approved") 
gen experimental = regexm(druggroups,"experimental")
gen investigational = regexm(druggroups,"investigational")
drop druggroups

* Trim the ID BEFORE de-duplicating: IDs that differ only by surrounding
* whitespace would otherwise survive the duplicates drop, collapse to the same
* value after trim(), and break the later m:1 merge on drugid.
rename drugbankid drugid
replace drugid = trim(drugid)

* force: keep the first row per ID even if the flag values differ across rows
duplicates drop drugid, force

tempfile groups
save `groups'

* Mark whether drugs are pharmacologically active or not.
* For each target file, build the set of unique (pdbid, drugids) pairs and
* store it in a tempfile whose local macro name equals the loop value
* (`all' and `pharmacologically_active').
foreach vv in all pharmacologically_active {
	import delim using "${data_raw}Restricted/db_target_`vv'.csv", 			///
		delimiters(",") clear 

	* Create drug ID - PDB ID pairs and save.
	* Both columns hold semicolon-separated lists; split each into wide
	* columns, then reshape long twice so every PDB ID in a row is paired
	* with every drug ID in that row.
	keep pdbid drugids
	gen nn = _n
	split drugids, p(;)
	drop drugids
	split pdbid, p(;)
	drop pdbid

	* First pass: one row per (original row, PDB ID)
	greshape long pdbid, i(nn) j(jj) dropmiss
	drop nn jj

	* Second pass: one row per (original row, PDB ID, drug ID)
	gen nn = _n
	greshape long drugids, i(nn) j(jj) dropmiss	
	drop nn jj

	replace pdbid = trim(pdbid)
	replace drugids = trim(drugids)

	* dropmiss only removes pieces that were already empty; a piece that was
	* whitespace-only (e.g. after a trailing "; ") becomes empty only after
	* trim(), so drop those here rather than keep a pair with a blank ID.
	drop if pdbid == "" | drugids == ""

	duplicates drop

	tempfile `vv'
	save ``vv''
	
}	

* Start from the pharmacologically active pairs and flag them with 1
use `pharmacologically_active', clear
gen pharm_active = 1 

* Bring in the full pair list; the assert checks that no active pair is
* absent from the full list (i.e. no master-only rows).
merge 1:1 drugids pdbid using `all'
assert _merge != 1
drop _merge

* Pairs that came only from the full list are not pharmacologically active
rename drugids drugid
replace pharm_active = 0 if missing(pharm_active)

* Confirm one row per (structure, drug) pair and put the keys first
sort pdbid drugid
isid pdbid drugid
order pdbid drugid
	
* Attach the drug-level status flags (i.e. approval status); keep every pair,
* dropping only drugs that appear in the index but in no pair.
merge m:1 drugid using `groups', keep(master match) nogenerate

* Pairs whose drug was not found in the index get zeros for every flag
foreach flag in approved experimental investigational {
	replace `flag' = 0 if missing(`flag')
}

* Count drugs per structure: total and by each flag
gen all = 1
gcollapse (sum) all pharm_active approved experimental investigational, by(pdbid)
isid pdbid
rename pdbid structureId

tempfile drugbank
save `drugbank'

* Take the full list of structure IDs from the summary file, one row each
use "${data_clean}clean_summary.dta", clear
keep structureId
duplicates drop

* Keep every structure in the list; structures without DrugBank pairs have
* missing counts after the merge.
merge 1:1 structureId using `drugbank', keep(master match) nogenerate

* Rename each count, set missing counts to zero, and add a 0/1 indicator
* for whether the count is positive.
foreach stat in all pharm_active approved experimental investigational {
	local cnt drugbank_`stat'_count
	rename `stat' `cnt'
	replace `cnt' = 0 if missing(`cnt')
	gen drugbank_`stat'_dummy = (`cnt' > 0)
}

save "${data_clean}clean_drugbank.dta", replace
	
